import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import r_regression, chi2, SelectKBest
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
def is_number(num_str):
    """
    Return True if the string `num_str` can be parsed as a float
    (e.g. "3", "-1.5", "1e3"), False otherwise.
    """
    try:
        float(num_str)
        return True
    except ValueError:
        return False


# Vectorised versions of the is_number predicate.
# NOTE: the module previously also bound these names directly to
# np.vectorize objects, which the function definitions below immediately
# shadowed; those dead assignments have been removed. The zero-argument
# factory form is kept because callers invoke e.g. vec_isalpha()(column).
def vec_isalpha():
    """Return a vectorised predicate that is True for non-numeric elements."""
    return np.vectorize(lambda x: not is_number(x))


def vec_isnum():
    """Return a vectorised predicate that is True for numeric elements."""
    return np.vectorize(lambda x: is_number(x))
def plot_bar_data(*bars, x=None, title="", x_label="", y_label=""):
    """
    Build a grouped bar chart from (name, values) pairs.

    Parameters
    ----------
    *bars : tuple
        Each argument is a (name, values) pair: the trace label and the
        sequence of bar heights.
    x : sequence, optional
        Shared x-axis categories for every trace.
    title, x_label, y_label : str
        Chart title and axis labels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    traces = []
    for name, values in bars:
        traces.append(go.Bar(name=f"{name}", x=x, y=values))
    layout = {
        "title": title,
        "xaxis": {"title": x_label},
        "yaxis": {"title": y_label},
        "barmode": "group"
    }
    return go.Figure(layout=layout, data=traces)
class DatasetManager():
    """
    End-to-end preprocessing pipeline for a single CSV dataset: loading,
    cleaning, ordinal encoding, imputation of missing values, correlation
    based feature selection and feature scaling.

    The last column of the dataset is treated as the target variable.
    """

    def __init__(self, ds_name):
        """
        Parameters
        ----------
        ds_name : str
            Base name of the dataset; data is loaded from "<ds_name>.csv".
        """
        self._original_ds = None     # raw string array as read from the csv
        self._clean_ds = None        # rows with missing critical values removed
        self._encoded_ds = None      # categorical columns ordinal-encoded (still str dtype)
        self._encodings = None       # {column index: array of original category labels}
        self._numerised_ds = None    # float array with "" converted to np.nan
        self._complete_ds = None     # missing values filled in by an imputer
        self._feat_ds = None         # selected feature columns only
        self._feat_cols = None       # indices of the selected feature columns
        self._scores = None          # per-column feature-selection scores
        self._scaled_feat_ds = None  # feature set scaled to the range [-1, 1]
        self._ds_name = ds_name

    # --- Simple accessors -------------------------------------------------

    def get_ds_name(self):
        return self._ds_name

    def get_original_ds(self):
        return self._original_ds

    def get_clean_ds(self):
        return self._clean_ds

    def get_encoded_ds(self):
        return self._encoded_ds

    def get_encodings(self):
        return self._encodings

    def get_numerised_ds(self):
        return self._numerised_ds

    def get_complete_ds(self):
        return self._complete_ds

    def get_feat_ds(self):
        return self._feat_ds

    def get_feat_ds_cols_and_scores(self):
        return self._feat_cols, self._scores

    def get_scaled_feat_ds(self):
        return self._scaled_feat_ds

    # --- Pipeline ---------------------------------------------------------

    def load_and_preprocess(self, crit_cols, imp_choice):
        """
        Run the full load/clean/encode/numerise/impute pipeline.

        Parameters
        ----------
        crit_cols : list[int] or None
            Columns in which a missing value causes the row to be dropped.
            None (or empty) means a missing value in ANY column drops the row.
        imp_choice : str
            Imputation method: "simple", "knn" or "iterative".
        """
        # Load dataset
        self.load_dataset(self._ds_name)
        print("Dataset loaded...")
        # Clean dataset
        self.clean_dataset(crit_cols)
        print("Dataset cleaned..")
        # Encode dataset
        self.encode_dataset()
        print("Dataset encodings..")
        # Numerise dataset
        self.numerise_dataset()
        print("Dataset numerised...")
        # Impute missing values
        self.impute_dataset(imp_choice)
        print("Missing values imputed...")

    def load_dataset(self, ds_name):
        """
        Load "<ds_name>.csv" into a string numpy array (header skipped),
        stripping surrounding whitespace from every element.
        """
        ds = np.genfromtxt(
            f"{ds_name}.csv",
            delimiter=",",
            skip_header=True,
            dtype="str"
        )
        # Strip whitespace from start and end of all elements
        self._original_ds = np.char.strip(ds)

    def clean_dataset(self, check_cols=None):
        """
        Drop rows that have a missing value ("") in any of `check_cols`.
        If `check_cols` is None (or empty), every column is checked.
        """
        ds = np.copy(self._original_ds)
        # Mask of rows containing a missing value in a critical column
        if not check_cols:
            missing_rows_mask = np.any(ds == "", axis=1)
        else:
            missing_rows_mask = np.any(ds[:, check_cols] == "", axis=1)
        # Remove flagged rows using boolean indexing
        self._clean_ds = ds[~missing_rows_mask]

    def encode_dataset(self):
        """
        Ordinal-encode every column whose entries are all non-numeric.
        The original category labels for column i are kept in
        self._encodings[i] so the encoding can be reversed.
        """
        self._encoded_ds = np.copy(self._clean_ds)
        self._encodings = {}
        num_cols = self._encoded_ds.shape[1]
        for i in range(num_cols):
            # Only encode columns that are entirely non-numeric
            if np.all(vec_isalpha()(self._encoded_ds[:, i])):
                oec = OrdinalEncoder(categories="auto", dtype=float)
                oec.fit(self._encoded_ds[:, [i]])
                # Replace the column with its encoded values; they are stored
                # as strings because the array still has a string dtype
                self._encoded_ds[:, i] = oec.transform(self._encoded_ds[:, [i]]).flatten()
                # Save category encoding for this column
                self._encodings[i] = oec.categories_[0]

    def numerise_dataset(self):
        """
        Convert the (encoded) string array to floats, mapping empty
        strings to np.nan so they can be imputed later.
        """
        self._numerised_ds = np.where(
            self._encoded_ds == "",
            np.nan,
            self._encoded_ds
        ).astype(float)

    def impute_dataset(self, imp_choice="simple"):
        """
        Fill in missing (NaN) values using the chosen imputation method.

        Parameters
        ----------
        imp_choice : str
            "simple" (column mean), "knn" or "iterative".

        Raises
        ------
        ValueError
            If `imp_choice` is not a recognised method.
        """
        complete_ds = np.copy(self._numerised_ds)
        # Initialise the requested imputer
        if imp_choice == "simple":
            imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
        elif imp_choice == "knn":
            imputer = KNNImputer(n_neighbors=2)
        elif imp_choice == "iterative":
            imputer = IterativeImputer(random_state=0)
        else:
            # An unknown choice previously caused a NameError below;
            # fail fast with a clear message instead.
            raise ValueError(f"Unknown imputation method: {imp_choice!r}")
        # Fit imputer and fill missing values
        self._complete_ds = imputer.fit_transform(complete_ds)

    def create_feature_set(self, n_features):
        """
        Select the n_features columns most correlated (Pearson) with the
        target (last column), storing the reduced dataset, the selected
        column indices and the per-column scores.
        """
        ds = np.copy(self._complete_ds)
        # Select features and record their scores
        feat_selector = SelectKBest(r_regression, k=n_features)
        self._feat_ds = feat_selector.fit_transform(ds[:, :-1], ds[:, -1])
        self._scores = feat_selector.scores_
        # Indices of the k highest-scoring columns, in ascending column order
        self._feat_cols = np.sort(np.argsort(self._scores)[-n_features:])

    def scale_feature_set(self):
        """
        Scale the selected feature set to the range [-1, 1].
        """
        feat_ds = np.copy(self._feat_ds)
        scaler = MinMaxScaler(feature_range=(-1, 1))
        self._scaled_feat_ds = scaler.fit_transform(feat_ds)

    def test_feature_number(
        self,
        model_type="clf",
        corr_type="pearson",
        test_size=0.2,
        chart_title="Model Accuracy vs Number of Features"
    ):
        """
        Train a basic SVM on feature sets of every size from 1 to
        n_features - 1 and chart training/testing accuracy per size.

        Parameters
        ----------
        model_type : str
            "clf" for SVC (classification) or "reg" for SVR (regression).
        corr_type : str
            Feature-scoring function: "pearson" (default, r_regression)
            or "chi2" (note: chi2 requires non-negative features).
        test_size : float
            Fraction of samples held out for testing.
        chart_title : str
            Title of the generated bar chart.

        Returns
        -------
        plotly.graph_objects.Figure

        Raises
        ------
        ValueError
            If `model_type` is not "clf" or "reg".
        """
        # Validate up front: an unknown model type previously caused a
        # NameError at fit time inside the loop.
        if model_type not in ("clf", "reg"):
            raise ValueError(f"Unknown model type: {model_type!r}")
        # Pick the feature-scoring function (Pearson correlation by default).
        # This parameter was previously accepted but ignored.
        score_func = chi2 if corr_type == "chi2" else r_regression
        ds = np.copy(self._complete_ds)
        # Cap the number of samples so the SVMs stay fast to train
        n_samples = min(10000, ds.shape[0])
        n_features = ds.shape[1]
        x = []
        train_acc_vals = ("Training Accuracy", [])
        test_acc_vals = ("Testing Accuracy", [])
        avg_vals = ("Weighted Average Across Both Sets", [])
        for i in range(1, n_features):
            x.append(f"n = {i}")
            # Selecting the i best features
            feat_selector = SelectKBest(score_func, k=i)
            feat_ds = feat_selector.fit_transform(ds[:n_samples, :-1], ds[:n_samples, -1])
            # Splitting the dataset
            X_train, X_test, y_train, y_test = train_test_split(feat_ds, ds[:n_samples, -1], test_size=test_size)
            # Normalising training data
            scaler = MinMaxScaler(feature_range=(-1, 1))
            X_train_norm = scaler.fit_transform(X_train)
            # Fresh model for each feature count
            model = SVC() if model_type == "clf" else SVR()
            # Train model using the training data
            model.fit(X_train_norm, y_train)
            # Training accuracy
            train_acc = model.score(X_train_norm, y_train)
            train_acc_vals[1].append(train_acc)
            # Testing accuracy (test data scaled with the training scaler)
            X_test_norm = scaler.transform(X_test)
            test_acc = model.score(X_test_norm, y_test)
            test_acc_vals[1].append(test_acc)
            # Average of the two accuracies, weighted by split proportions
            avg_vals[1].append(np.average([train_acc, test_acc], weights=[1-test_size,test_size]))
        # Averages across all feature counts
        x.append("Average")
        train_acc_vals[1].append(np.mean(train_acc_vals[1]))
        test_acc_vals[1].append(np.mean(test_acc_vals[1]))
        avg_vals[1].append(np.mean(avg_vals[1]))
        # Visualise model performance as a function of feature count
        performance_data = plot_bar_data(
            train_acc_vals,
            test_acc_vals,
            avg_vals,
            x=x,
            title=chart_title,
            x_label="Number of Features (n)",
            y_label="Model Accuracy"
        )
        return performance_data
# Manager for the GWP dataset (reads gwp_assessment.csv)
gwp_dsm = DatasetManager("gwp_assessment")
# Manager for the Star dataset (reads star_assessment.csv)
star_dsm = DatasetManager("star_assessment")
Running a basic model on different feature sets in order to assess how different imputation methods affect accuracy.
# Loading and preprocessing with the simple (mean) imputation method
gwp_dsm.load_and_preprocess([0,1,2,3], "simple") # rows with missing values in columns 0,1,2 or 3 will be deleted.
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "simple") # rows with missing values in columns 0,1,8,9,12,16,17 will be deleted
Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed... Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed...
# Performance data for GWP dataset (regression model, simple imputation)
chart_title = "Model Accuracy vs Number of Features (Simple Imputation Method)"
gwp_dsm.test_feature_number("reg", chart_title=chart_title)
# Performance data for Star dataset (classification model, simple imputation)
chart_title = "Model Accuracy vs Number of Features (Simple Imputation Method)"
star_dsm.test_feature_number("clf", chart_title=chart_title)
# Reloading and preprocessing with the KNN imputation method
gwp_dsm.load_and_preprocess([0,1,2,3], "knn") # rows with missing values in columns 0,1,2 or 3 will be deleted.
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn") # rows with missing values in columns 0,1,8,9,12,16,17 will be deleted
Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed... Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed...
# Performance data for GWP dataset (regression model, KNN imputation)
chart_title = "Model Accuracy vs Number of Features (KNN Imputation Method)"
gwp_dsm.test_feature_number("reg", chart_title=chart_title)
# Performance data for Star dataset (classification model, KNN imputation)
chart_title = "Model Accuracy vs Number of Features (KNN Imputation Method)"
star_dsm.test_feature_number("clf", chart_title=chart_title)
# Reloading and preprocessing with the iterative imputation method
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative") # rows with missing values in columns 0,1,2 or 3 will be deleted.
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "iterative") # rows with missing values in columns 0,1,8,9,12,16,17 will be deleted
Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed... Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed...
# Performance data for GWP dataset (regression model, iterative imputation)
chart_title = "Model Accuracy vs Number of Features (Iterative Imputation Method)"
gwp_dsm.test_feature_number("reg", chart_title=chart_title)
# Performance data for Star dataset (classification model, iterative imputation)
chart_title = "Model Accuracy vs Number of Features (Iterative Imputation Method)"
star_dsm.test_feature_number("clf", chart_title=chart_title)
GWP Dataset
The simple imputation method yields the highest average training accuracy but the iterative method results in a higher average testing accuracy and weighted average between training and testing accuracy. Therefore, the optimal imputation method is the iterative method.
Star Dataset
The KNN imputation method has the slight edge over the other two methods in terms of average training accuracy and the weighted average between average training accuracy and average testing accuracy. Therefore, the optimal imputation method is the KNN method.
# Reloading and preprocessing each dataset with its optimal imputation method
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative") # rows with missing values in columns 0,1,2 or 3 will be deleted.
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn") # rows with missing values in columns 0,1,8,9,12,16,17 will be deleted
Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed... Dataset loaded... Dataset cleaned.. Dataset encodings.. Dataset numerised... Missing values imputed...
# Performance data for GWP dataset (regression, iterative imputation)
gwp_dsm.test_feature_number("reg")
# Performance data for Star dataset (classification, KNN imputation)
star_dsm.test_feature_number("clf")
GWP Dataset
Training and testing accuracy steadily increase until n = 7. At this point the two metrics begin to diverge, which would suggest that any feature set with more than 7 features would quickly lead to overfitting. Therefore, the optimal number of features is 7.
Star Dataset
Training and testing accuracy increase until n = 8. At this point both accuracy metrics decrease until n = 11, where there is a sudden spike; this would suggest that overfitting sets in at n = 11. Therefore, the optimal number of features is 8.
# Creating feature sets of the optimal size for each dataset
gwp_dsm.create_feature_set(7)  # optimal number of features from the analysis above
star_dsm.create_feature_set(8)  # optimal number of features from the analysis above
# Scaling each feature set to the range [-1, 1]
gwp_dsm.scale_feature_set()
star_dsm.scale_feature_set()
# Final scaled GWP dataset
gwp_dsm.get_scaled_feat_ds()
array([[-1. , 0.2 , 1. , ..., -0.06349206,
-0.94555556, 0.31034483],
[-1. , 0.2 , 0.8630137 , ..., -0.87301587,
-1. , -0.86206897],
[-1. , 0.2 , 1. , ..., -0.51587302,
-0.97222222, -0.34482759],
...,
[ 0.75862069, 1. , 0.7260274 , ..., -0.87301587,
-1. , -0.86206897],
[ 0.75862069, 1. , 0.8630137 , ..., -0.76190476,
-1. , -0.70114943],
[ 0.75862069, 1. , 0.7260274 , ..., -0.9047619 ,
-1. , -0.90804598]])
# Feature columns selected for the scaled GWP dataset
feat_cols, _ = gwp_dsm.get_feat_ds_cols_and_scores()
feat_cols
array([ 0, 3, 5, 7, 8, 9, 13])
# Final scaled Star dataset (selected features, scaled to [-1, 1])
star_dsm.get_scaled_feat_ds()
array([[ 0.00760492, -0.14467077, 0.99788799, ..., -0.09682969,
-0.09681622, 0.29601311],
[-0.01637576, 0.03197251, 0.9984505 , ..., 0.65766595,
0.6576826 , 0.78864009],
[ 0.06827764, -0.12854131, 0.99791881, ..., -0.29810758,
-0.29810276, 0.08793009],
...,
[-0.32238054, -0.2748241 , 0.9976005 , ..., -0.59320128,
-0.59319274, -0.20070999],
[ 0.28594879, -0.15289164, 0.99785431, ..., 0.05609863,
0.05610292, 0.29983616],
[ 0.34105031, -0.07022863, 0.99798512, ..., 0.16343213,
0.16342317, 0.50081922]])
# Feature columns selected for the scaled Star dataset
feat_cols, _ = star_dsm.get_feat_ds_cols_and_scores()
feat_cols
array([ 2, 6, 7, 9, 10, 12, 14, 15])
The Pearson correlation coefficient measures the linear relationship between two variables. It helps identify relevant features by quantifying their linear relationship with the target variable, and it detects redundancy among features by comparing their pairwise correlations. The strength and direction of the correlation guide the selection of features with the most predictive power. The key benefit of Pearson correlation is its simplicity and ease of interpretation. Pearson correlation is a simple and effective way to identify relevant and non-redundant features for a model. Pearson correlation helps select features that have strong predictive power while minimising the risk of multicollinearity. This yields more efficient and accurate models by reducing the number of features used without sacrificing valuable information content. Its ease of calculation and interpretation of coefficients make it a popular choice for machine learning practitioners, as it allows for straightforward comparison and ranking of features based on their relationship with the target variable.